package belloCollector.test;

import java.io.File;
import java.io.IOException;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.concurrent.atomic.AtomicInteger;

import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.util.Config;
import cn.edu.hfut.dmic.webcollector.util.FileUtils;

/**
 * Crawler that walks Hupu forum threads whose title contains "不冷笑话" and
 * saves the GIF images they reference into a local folder.
 *
 * <p>Thread pages (URLs matching {@link #THREAD_URL_REGEX}) are scanned for
 * {@code <img src>} links, which are queued as follow-up tasks. Any other
 * fetched resource is written to disk when its content type ends with
 * {@code gif}. Saved files are named with an increasing integer id.
 */
public class JokeCrawler extends BreadthCrawler {

    /** URL pattern of a single forum thread; shared by {@link #visit} and {@link #main}. */
    private static final String THREAD_URL_REGEX = "http://bbs.hupu.com/[0-9]+.html";

    // Folder the downloaded images are written to.
    File downloadDir;

    // Atomic counter used to generate image file names.
    AtomicInteger imageId;

    /**
     * Creates the crawler, makes sure the download folder exists, and
     * initialises the image counter from the files already in it.
     *
     * @param crawlPath    folder used to maintain the crawl's URL state
     * @param downloadPath folder used to store the downloaded images
     * @throws IllegalStateException if {@code downloadPath} is not a directory
     *                               and cannot be created
     */
    public JokeCrawler(String crawlPath, String downloadPath) {
        // NOTE(review): second argument is presumably WebCollector's "autoParse" flag — confirm.
        super(crawlPath, true);
        downloadDir = new File(downloadPath);
        // Re-check isDirectory() after mkdirs(): mkdirs() also returns false
        // when the directory was created concurrently by someone else.
        if (!downloadDir.isDirectory() && !downloadDir.mkdirs() && !downloadDir.isDirectory()) {
            throw new IllegalStateException(
                    "Cannot create download directory: " + downloadDir.getAbsolutePath());
        }
        computeImageId();
    }

    /**
     * Handles one fetched page: thread pages contribute image links to
     * {@code next}; everything else is saved to disk if it is a GIF.
     *
     * @param page the fetched page
     * @param next collector for URLs to crawl in the following round
     * @throws RuntimeException wrapping the {@link IOException} if an image
     *                          cannot be written
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        if (page.matchUrl(THREAD_URL_REGEX)) {
            collectImageLinks(page, next);
        } else {
            saveIfGif(page);
        }
    }

    /** Queues every resolvable image URL of a thread whose title contains "不冷笑话". */
    private void collectImageLinks(Page page, CrawlDatums next) {
        // first() returns null when the selector matches nothing (e.g. deleted
        // thread or changed layout); skip such pages instead of failing.
        Element titleElement = page.select("div[class=bbs-hd-h1]").first();
        if (titleElement == null || !titleElement.text().contains("不冷笑话")) {
            return;
        }
        Elements imgs = page.select("img[src]");
        for (Element img : imgs) {
            String imgSrc = img.attr("abs:src");
            // "abs:" attributes come back empty when the URL cannot be made
            // absolute; an empty string is not a usable crawl task.
            if (!imgSrc.isEmpty()) {
                next.add(imgSrc);
            }
        }
    }

    /** Writes the page content to the download folder when its content type ends with "gif". */
    private void saveIfGif(Page page) {
        String contentType = page.getResponse().getContentType();
        if (contentType == null || !contentType.endsWith("gif")) {
            return;
        }
        // Take the part after the last '/', so "image/gif" -> "gif" and a bare
        // "gif" (no slash) no longer triggers an index error.
        String extensionName = contentType.substring(contentType.lastIndexOf('/') + 1);
        String imageFileName = imageId.incrementAndGet() + "." + extensionName;
        File imageFile = new File(downloadDir, imageFileName);
        try {
            FileUtils.writeFile(imageFile, page.getContent());
            System.out.println("保存图片 " + page.getUrl() + " 到 " + imageFile.getAbsolutePath());
        } catch (IOException ex) {
            throw new RuntimeException(ex);
        }
    }

    /**
     * Entry point: seeds the crawler with ten pages of forum search results
     * and starts it. Output folders are created under {@code ../../result/}
     * and are prefixed with today's date in {@code yyyyMMdd} form.
     */
    public static void main(String[] args) throws Exception {
        String date = LocalDate.now().format(DateTimeFormatter.BASIC_ISO_DATE);
        JokeCrawler jokeCrawler = new JokeCrawler(
                "../../result/" + date + "_bxjJoke",
                "../../result/" + date + "_jokeImgDownload");
        // Seed URLs: search result pages 1..10.
        // (The board index http://bbs.hupu.com/bxj was used as a seed previously.)
        // NOTE(review): the q= parameter looks like GBK percent-encoding of the
        // search phrase — confirm before changing it.
        for (int i = 1; i <= 10; i++) {
            jokeCrawler.addSeed("http://my.hupu.com/search?q=%A1%B6%B2%BB%C0%E4%D0%A6%BB%B0%A1%B7%B5%DA&type=topic&fid=34&sortby=datedesc&page=" + i);
        }
        // Restrict the crawl to thread pages.
        jokeCrawler.addRegex(THREAD_URL_REGEX);
        // Resume from the previous crawl state; otherwise every run starts over.
        jokeCrawler.setResumable(true);
        jokeCrawler.setThreads(30);
        // Raise the response size limit to 10,000,000 (presumably bytes) so larger GIFs fit.
        Config.MAX_RECEIVE_SIZE = 1000 * 1000 * 10;
        // NOTE(review): the argument is presumably the crawl depth — confirm against WebCollector docs.
        jokeCrawler.start(4);
    }

    /**
     * Initialises {@link #imageId} to the largest numeric file-name prefix
     * found in the download folder, or to -1 when there is none, so that the
     * next saved image gets an unused name.
     *
     * <p>Entries whose name does not start with an integer before the first
     * dot (for example {@code .DS_Store}) are ignored. If the folder cannot be
     * listed the counter starts at -1.
     */
    public void computeImageId() {
        int maxId = -1;
        // listFiles() returns null if the path is not a directory or an I/O error occurs.
        File[] existing = downloadDir.listFiles();
        if (existing != null) {
            for (File imageFile : existing) {
                String fileName = imageFile.getName();
                int dot = fileName.indexOf('.');
                String idStr = dot < 0 ? fileName : fileName.substring(0, dot);
                try {
                    maxId = Math.max(maxId, Integer.parseInt(idStr));
                } catch (NumberFormatException ignored) {
                    // Not one of our numbered images; it cannot collide with generated names.
                }
            }
        }
        imageId = new AtomicInteger(maxId);
    }

}
